package com.ztesoft.test.service;

import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;

import java.util.regex.Pattern;

import org.jsoup.nodes.Document;

/**
 * Crawler for 58.com "yuesao" (maternity-matron) listing pages in Yinchuan.
 * Note: the class name "YahooCrawler" is a leftover from the WebCollector
 * example this was adapted from — it does not crawl Yahoo.
 *
 * @author hu
 */
public class YahooCrawler extends DeepCrawler {

	/**
	 * Matches 58.com listing pages. Compiled once here because
	 * {@code Pattern.matches(...)} recompiles the regex on every call, and
	 * {@link #visitAndGetNextLinks(Page)} runs for every fetched page.
	 */
	private static final Pattern LIST_PAGE_PATTERN = Pattern
			.compile("http://yinchuan.58.com/jiefdj/baomu/.*");

	/**
	 * @param crawlPath
	 *            path of the directory which maintains the persisted state of
	 *            this crawler
	 * @param autoParse
	 *            NOTE(review): this parameter is never used — the constructor
	 *            only forwards {@code crawlPath} to the superclass, so the
	 *            value passed here is silently discarded. It is kept in the
	 *            signature for source compatibility with existing callers.
	 */
	public YahooCrawler(String crawlPath, boolean autoParse) {
		super(crawlPath);
		/* Seed pages: 58.com search-result listings (result page 1 and 2). */
		this.addSeed("http://yinchuan.58.com/jiefdj/baomu/?key=月嫂&ampcmcskey=月嫂&ampfinal=1&ampspecialtype=gls&ampPGTID=183655893188173277636398103&ampClickID=1&nearby=jiefdj");
		this.addSeed("http://yinchuan.58.com/jiefdj/baomu/pn2/?key=月嫂&ampcmcskey=月嫂&ampfinal=1&ampspecialtype=gls&&&nearby=jiefdj&PGTID=196080285188173285446113417&ClickID=1");
	}

	/**
	 * Called by the crawler for each fetched page.
	 *
	 * @param page
	 *            the fetched page
	 * @return the links to crawl in the next layer when the page is a 58.com
	 *         listing page, otherwise {@code null} (no links followed)
	 */
	@Override
	public Links visitAndGetNextLinks(Page page) {
		String url = page.getUrl();
		/* If the page is a 58.com listing page, collect the entry links. */
		if (LIST_PAGE_PATTERN.matcher(url).matches()) {
			System.out.println(1);
			Links nextLinks = new Links();
			/*
			 * Return the hyperlinks of all search-result entries; the crawler
			 * will fetch them in the next crawl layer.
			 */
			nextLinks.addAllFromDocument(page.getDoc(), "a[class=t]");
			return nextLinks;
		}
		/*
		 * Non-listing page: just log its URL. (The original re-fetched
		 * page.getUrl() into a redundant local here; `url` already holds it.)
		 */
		System.out.println(2);
		System.out.println(url);
		return null;
	}

	public static void main(String[] args) throws Exception {
		YahooCrawler crawler = new YahooCrawler("yuesao", true);
		crawler.setThreads(50);
		// crawler.setTopN(100);
		crawler.start(2);
	}

}